##%%%%%%%%%%%%%%%%%%%%%%%% ANALdeming.R %%%%%%%%%%%%%%%%%%%%%%%%%%%
##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## In this script we check for overfitting using the original
## Deming 2009 paper.

library(Matrix)
library(MASS) ## For ginv() - Moore penrose inverse
library(ggplot2)
library(stringr) ## for string manipulation
## library(DAAG)
library(foreign)  ## For Deming Files

##%%%%%%%%%%%%%%%% Set Working Directory %%%%%%%%%%%%

setwd("D:/RWORK/Repository Files")

##%%%%%%%%%%%%%%%%%%% Load functions %%%%%%%%%%%%%%%%

source("D:/RWORK/Repository Files/FUNC.R")



##%%%%% Evaluation of Deming's Paper using CV and formula based shrinkage %%%%%%%%%%%%%%
##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## Get original Deming data (using commands from "demingRobustInf.R")

## Load strata file (*.dta) to create data frame
rtV= read.dta("regressorAndTargetsN.dta")
Noncog=read.dta("NoncogStd.dta")

rtVM=as.matrix(rtV)
NoncogS=as.matrix(Noncog[,"Noncog_std"])
rtVM=cbind(rtVM,NoncogS)

rm(Noncog)
rm(NoncogS)

## Write out the variable names to a file where we can pick
## those out used as Demings targets and regressors
## cc=colnames(rtVM)
## write(cc,"cc.txt")
 
## Check Table-1 variables
tab1V=c("PreK","PreK_FE","Black","NonBlack","PermInc","MomDropout","MomSomeColl","AgeAFQT_std","HighGrade_GMom79")
tab1M=rtVM[,tab1V]

## Index of rows without missing variables for PreK (3698 examples)
## this is the same number as in the paper.
jjL=Rsum(!is.na(tab1M[,c(1,3:ncol(tab1M))]))==(ncol(tab1M)-1)
countL=sum(jjL)

## Index of rows without missing variables for PreK_FE (1663 examples)
## this is the same number as in the paper.
jjS=Rsum(!is.na(tab1M[,c(2,3:ncol(tab1M))]))==(ncol(tab1M)-1)
countS=sum(jjS)


## Index used in fixed effects regression (1251 examples)
jjFE=!is.na(rtVM[,"Sample90_2"])
countFE=sum(jjFE)


## Collect Deming Covariate variable names
## HS2_FE90 Pre2_FE90 Male Black NonBlack White Hispanic
cc=colnames(rtVM)
last4=substring(cc,nchar(cc)-3,nchar(cc))
ii=last4=="_imp" | last4=="miss"
ccx=cc[ii]
ccy=c("HS2_FE90","Pre2_FE90","Male")
ccz=paste("_IAge2_Yr10_",1:33,sep="")
ccV=c(ccy,ccx,ccz)

## c1=c("Res_0to3","HealthCond_before","logBW","LogInc_0to3","LogIncAt3")
## c2=c("FirstBorn","Male","Age2_Yr104","HOME_Pct_0to3","Moth_HrsWorked_BefBirth")
## c3=c("Moth_HrsWorked_0to1","Father_HH_0to3","GMom_0to3","MomCare","RelCare","NonRelCare")
## c4=c("Moth_Smoke_BefBirth","Alc_BefBirth","Breastfed","Doctor_0to3","Dentist_0to3")
## c5=c("Moth_WeightChange","Illness_1stYr","Premature","Insurance_0to3","Medicaid_0to3")
## HS_5to6 HS_7to10 HS_11to14 Pre_5to6 Pre_7to10 Pre_11to14
## ccy=c(c1,c2,c3,c4,c5)

## Create matrix of unabhangig covariates
covM=rtVM[,ccV]
covMC=covM[,ccx]
covMA=covM[,ccz]
covMHS=covM[,ccy]


## Find group of covariates that are not collinear on the FE dataset
t1=rtVM[jjFE,"Sum_Adult"]
cov1=rtVM[jjFE,ccV]

## Use lm to find dropped variables (these will have NA coefficients in regression)
foo=lm(t1 ~ cov1)
iiS=!is.na(foo$coefficients[2:length(foo$coefficients)])
ccVS=ccV[iiS]


## Test this
cov1S=rtVM[jjFE,ccVS]
fooS=lm(t1 ~ cov1S)
coef1=foo$coefficients[2:length(foo$coefficients)][iiS]
coef1S=fooS$coefficients[2:length(fooS$coefficients)]
test=coef1-coef1S

## > sum(test!=0)
## [1] 0

## Collect Deming Target variable names
## Noncog Noncog_std Sum_Adult Sum_wage (Sum_Adult and Sum_wage don't have "_std" versions)
## LD HSGrad HSGrad_GED someCollAtt Idle Crime TeenPreg PoorHealth
## LD Repeat HSGrad someCollAtt Idle Crime Prison TeenPreg PoorHealth HSGrad_GED
## all of these with "_std"
cct1=c("Noncog_std","Sum_Adult","Sum_wage")
cct2=c("LD","Repeat","HSGrad","someCollAtt","Idle","Crime","Prison","TeenPreg","PoorHealth","HSGrad_GED")
cct3=paste(cct2,"_std",sep="")
ccT=c(cct1,cct2,cct3)

## Create Matrices of Target Variables
tarM=rtVM[,ccT]
tarM1=rtVM[,cct1]
tarM2=rtVM[,cct2]
tarM3=rtVM[,cct3]

##%%%%%%%%%%%%%%%%%% !!!! NOTE !!! %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## Adult outcomes are "Sum_Adult" and young adult (non-test)
## outcomes are in "Noncog_std"


## Create matrix of cluster dummy variables
momID=rtVM[,"MotherID"]
momIDN=j2z(momID)
jjmom=!is.na(momID)
momIDU=unique(momID[jjmom])
clusID=as.character(momIDU)
chID=rtVM[,"ChildID"]

clusDM=matrix(0,nrow(rtVM),length(momIDU))
colnames(clusDM)=clusID

for(i in 1:length(momIDU)){jj=(momIDN==momIDU[i]);
    clusDM[jj,clusID[i]]=1}

## Look for those clusters that will be relevant for the regression
ll=Csum(clusDM[jjL,])>0
clL=clusID[ll]
ll=Csum(clusDM[jjS,])>0
clS=clusID[ll]
ll=Csum(clusDM[jjFE,])>0
clFE=clusID[ll]



##%%%%%%%%%%%%%%%%%% Fixed Effects Regressions %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## Get fixed effects matrix, then put it together with covariates matrix
## NOTE!!! There are 136 of the kids here who singletons.
## This is the reason that N that we report is different 
## Than Deming's N.
clusFE=clusDM[jjFE,clFE]
xx_=cbind(cov1S,clusFE)

## Use as target the collected adult target
t1=rtVM[jjFE,"Sum_Adult"]
t2=rtVM[jjFE,"Noncog_std"]
##%%%%%%%%%% Model with everything %%%%%%%%%%%%%%%
## Note that this gives us an exact match with coefficients from Deming
## model as computed by stata.
fitFE=lm(t1~xx_)
sumfitFE=summary(fitFE)
View(sumfitFE[[4]])

## Compute this without singletons
lls=Csum(clusFE)>1
clFEs=clusID[lls]
iis=Rsum(clusFE[,lls])>0

## Create targets and covariates for these
xxs_=cbind(cov1S[iis,],clusFE[iis,lls])
t1s=t1[iis]
t2s=t2[iis]
##%%%%%%%%%%%%%%%% !!!NOTE!!! %%%%%%%%%%%%%%%%%%
## The covariate group xxs_ is the one used in all 
## the Deming models in his Table 4. It is also
## the one used in column 5 of table 3. 
## Column 4 of table 3 is a model with just head start
## other kinder garden and fixed effects dummies. 

## > dim(cov1S[iis,])
## [1] 1115   64
## > dim(clusFE[iis,lls])
## [1] 1115  438
## > dim(xxs_)
## [1] 1115  502

##%%%%%%%%%%%%%%%%% Fit models %%%%%%%%%%%%%%%%
## Fits of the full model of sumAdult and youngAdult
Fit1=lm(t1s~xxs_)
Fit2=lm(t2s~xxs_)

## Get summaries of these models
sumFit1=summary(Fit1)
sumFit2=summary(Fit2)

## > sumFit1$r.squared
## [1] 0.65587
## > sumFit2$r.squared
## [1] 0.5803242

##%%%%% Define a function to compute all the relevant rsq, cv.rsq, N, p etc. %%%
##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
crossValA=function(taR,inS){
	Fit=lm(taR~inS)
	sumFit=summary(Fit)
	coeF=sumFit$coefficients
	Fit.p=sum(!is.na(coeF[2:nrow(coeF),"Estimate"]))
	Fit.N=nrow(inS)
	Fit.r.squared=sumFit$r.squared
	Fit.cv=crossValD1S(taR,inS)[1:3,]
	Fit.LN=rsqOutHatLN(Fit.r.squared,Fit.N,Fit.p)
	Fit.DS=rsqOutHatDS(Fit.r.squared,Fit.N,Fit.p)
	Fit.R=rsqOutHatR(Fit.r.squared,Fit.N,Fit.p)
	## Collect all of these
	ouT=matrix(c(Fit.N,Fit.p,Fit.cv,Fit.LN,Fit.DS,Fit.R),8,1)
	ouT[1:2,]=round(ouT[1:2,])
	colnames(ouT)="Fit"
	rownames(ouT)=c("N","p","r.squared","adj.r.squared","cv.r.squared", 
				       "LN.r.squared","DS.r.squared","R.r.squared")
	ouT}


## Function to compute all but cv.r.squared using just r.squared, N, p
crossValAnoCV=function(rsq,N,p){
	Fit.p=p
	Fit.N=N
	Fit.adj.r.squared=adjRsq(rsq,N,p)
	Fit.cv=NA
	Fit.LN=rsqOutHatLN(rsq,Fit.N,Fit.p)
	Fit.DS=rsqOutHatDS(rsq,Fit.N,Fit.p)
	Fit.R=rsqOutHatR(rsq,Fit.N,Fit.p)
	ouT=matrix(c(Fit.N,Fit.p,rsq,Fit.adj.r.squared,NA,Fit.LN,Fit.DS,Fit.R),8,1)
	ouT[1:2,]=round(ouT[1:2,])
	colnames(ouT)="Fit"
	rownames(ouT)=c("N","p","r.squared","adj.r.squared","cv.r.squared", 
				       "LN.r.squared","DS.r.squared","R.r.squared")
	ouT}


##%%%%%%%%%%%%%%%% Compute rsqOutM for Deming models using crossValA %%%%%%%%%%%%%%%% 

rsqOut1=crossValA(t1s,xxs_)
rsqOut2=crossValA(t2s,xxs_)


## Do Table 3 stats
## from Deming Table 3
Tab3stats=matrix(0,3,2)
colnames(Tab3stats)=c("col4","col5")
rownames(Tab3stats)=c("r.squared","N","p")

Tab3stats[,"col4"]=c(0.608,1115,439)
Tab3stats[,"col5"]=c(0.619,1115,500)

## Use crossValAnoCV 
rsqOutTab3col4=crossValAnoCV(Tab3stats[1,"col4"],Tab3stats[2,"col4"],Tab3stats[3,"col4"])
rsqOutTab3col5=crossValAnoCV(Tab3stats[1,"col5"],Tab3stats[2,"col5"],Tab3stats[3,"col5"])


## Put this all together
rsqOutDem=cbind(rsqOut1,rsqOut2,rsqOut1S,rsqOut2S,rsqOutTab3col4,rsqOutTab3col5)
colnames(rsqOutDem)=c("Fit1","Fit2","Fit1S","Fit2S","Tab3col4","Tab3col5")

## > round(rsqOutDem,3)
##                   Fit1     Fit2    Fit1S    Fit2S Tab3col4 Tab3col5
## N             1115.000 1115.000 1115.000 1115.000 1115.000 1115.000
## p              500.000  500.000  439.000  439.000  439.000  500.000
## r.squared        0.656    0.580    0.522    0.522    0.608    0.619
## adj.r.squared    0.376    0.239    0.212    0.212    0.353    0.309
## cv.r.squared    -0.037   -0.254   -0.275   -0.275       NA       NA
## LN.r.squared     0.095   -0.104   -0.099   -0.099    0.098   -0.002
## DS.r.squared    -0.135   -0.384   -0.303   -0.303   -0.069   -0.256
## R.r.squared      0.096   -0.102   -0.098   -0.098    0.099   -0.001



##%%%%%%%%%%%% Get Deming Table 3 and 4 Coef and Std. Error %%%%%%%%%%
## These numbers are from the paper itself.

colNames=c("5-6","7-10","11-14","5-14","Nontest Score","Long Term")
rowNames=c("Overall","Black","NonBlack","Male","Female","lowAFQT","nonLowAFQT")

OverallHScoef =c(0.145,0.133,0.055,0.101,0.265,0.228)
OverallHSstderr=c(0.085,0.060,0.062,0.057,0.082,0.072)
BlackHScoef=c(0.287,0.127,0.031,0.107,0.351,0.237)
BlackHSstderr=c(0.095,0.075,0.076,0.072,0.120,0.103)
NonBlackHScoef=c(-0.057,0.11,0.156,0.110,0.177,0.224)
NonBlackHSstderr=c(0.120,0.092,0.095,0.090,0.111,0.102)
MaleHScoef=c(0.154,0.181,0.141,0.159,0.390,0.182)
MaleHSstderr=c(0.107,0.079,0.081,0.076,0.123,0.103)
FemaleHScoef=c(0.128,0.059,0.033,0.055,0.146,0.272)
FemaleHSstderr=c(0.106,0.083,0.085,0.081,0.108,0.106)
lowAfqtHScoef=c(0.171,0.016,-0.023,0.015,0.529,0.279)
lowAfqtHSstderr=c(0.129,0.095,0.102,0.094,0.156,0.114)
nonLowAfqtHScoef=c(0.133,0.172,0.144,0.154,0.124,0.202)
nonLowAfqtHSstderr=c(0.094,0.073,0.074,0.071,0.091,0.091)

tab4coefM=rbind(OverallHScoef,BlackHScoef,NonBlackHScoef,
			MaleHScoef,FemaleHScoef,lowAfqtHScoef,nonLowAfqtHScoef)
colnames(tab4coefM)=colNames
rownames(tab4coefM)=rowNames

tab4stderrM=rbind(OverallHSstderr,BlackHSstderr,NonBlackHSstderr,
			MaleHSstderr,FemaleHSstderr,lowAfqtHSstderr,nonLowAfqtHSstderr)

colnames(tab4stderrM)=colNames
rownames(tab4stderrM)=rowNames

## tstat matrix
tab4tstatM=round(abs(tab4coefM)/tab4stderrM,3)

## > tab4tstatM
##              5-6  7-10 11-14  5-14 Nontest Score Long Term
## Overall    1.706 2.217 0.887 1.772         3.232     3.167
## Black      3.021 1.693 0.408 1.486         2.925     2.301
## NonBlack   0.475 1.196 1.642 1.222         1.595     2.196
## Male       1.439 2.291 1.741 2.092         3.171     1.767
## Female     1.208 0.711 0.388 0.679         1.352     2.566
## lowAFQT    1.326 0.168 0.225 0.160         3.391     2.447
## nonLowAFQT 1.415 2.356 1.946 2.169         1.363     2.220
##
## As we can see here, the largest tstat is for lowAFQT/Nontest Score
## which is 3.391<3.5. 
## Further the largest of any of our estimates of shrinkR for
## any model was 0.175 for R.r.squared on Fit1.Nbl (Long Term/NonBlack)
## Since this is less than 0.2, the best average shrinkFst that 
## one can expect is 0.05.  Thus, the most hopeful shrinkage one can estimate
## of shrinkTstat=sqrt(0.05)=0.224 (2.5<tstat<=3.5, 0.1<shrinkR<=0.2)
##
## > sqrt(0.05)*3.391
## [1] 0.7582507
## Since most of the coefficients
##
## Although we didn't explicitly replicate the test score models
## the estimates we have for "Overall" from come from the formula
## based estimates from tab3col5 statistics were all negative. 
## Since the conditional models are generally similar or 
## worse than the "Overall" model, it is doubtful that the results
## from the conditional test score models would be particularly reliable.

## Table-3 coef, stderr
rowNms=c("tab3col4","tab3col5")
colNms=c("HS 5-6","HS 7-10","HS 11-14","otherPreSch 5-6","otherPreSch 7-10","otherPreSch 11-14")
tab3col4coef=c(0.131,0.0116,0.029,-0.102,0.031,-0.040)
tab3col4stderr=c(0.087,0.060,0.061,0.084,0.061,0.066)
tab3col5coef=c(0.145,0.133,0.055,-0.079,0.048,-0.022)
tab3col5stderr=c(0.085,0.060,0.062,0.085,0.065,0.069)

tab3coefM=rbind(tab3col4coef,tab3col5coef)
rownames(tab3coefM)=rowNms
colnames(tab3coefM)=colNms

tab3stderrM=rbind(tab3col4stderr,tab3col5stderr)
rownames(tab3stderrM)=rowNms
colnames(tab3stderrM)=colNms

## Table-3 tstatM
tab3tstatM=round(abs(tab3coefM)/tab3stderrM,3)

## > tab3tstatM
##          HS 5-6 HS 7-10 HS 11-14 otherPreSch 5-6 otherPreSch 7-10 otherPreSch 11-14
## tab3col4  1.506   0.193    0.475           1.214            0.508             0.606
## tab3col5  1.706   2.217    0.887           0.929            0.738             0.319
##
## Since the formula based shrinkage estimators for
## these models were all negative or less than 0.1
## (Note that the only positive values were for the
## col4, HS+Pre+fixed effects model.





##%%% Collect tstats and shrinkage for Overall results in  Table.4 %%%%%%%%%%%%%%%%%
##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
tabOverAll=rbind(tab4coefM[1,],tab4stderrM[1,],abs(tab4tstatM[1,]))
rownames(tabOverAll)=c("Coef.","std","|tstat|")

## > tabOverAll
## > tabOverAll
##           5-6  7-10 11-14  5-14 Nontest Score Long Term
## Coef.   0.145 0.133 0.055 0.101         0.265     0.228
## std     0.085 0.060 0.062 0.057         0.082     0.072
## |tstat| 1.706 2.217 0.887 1.772         3.232     3.167


## Here we use the formula based shrinkage estimates for all the 
## test score elements.
tabShrinkAll=rsqShrinkA[,c("Tab3col5","Tab3col5","Tab3col5","Tab3col5","Fit2","Fit1")]

## Drop adj.r.squared
tabShrinkAll=tabShrinkAll[c(1,3:6),]

## Define minShrTstat, avgShrTstat
minShrTstat=function(shrVec){
	shrN=max(shrVec[!is.na(shrVec)])
	ouT=0
	if(shrN>0) ouT=sqrt(shrN)
	ouT}

avgShrTstat=function(shrVec){
	shrN=mean(shrVec[!is.na(shrVec)])
	ouT=0
	if(shrN>0) ouT=sqrt(shrN)
	ouT}

shrTstatM=matrix(0,4,ncol(tabShrinkAll))
rownames(shrTstatM)=c("min.shr.tstat","avg.shr.tstat","tstat*min.shr.tstat","tstat*avg.shr.tstat") 

for(i in 1:ncol(shrTstatM)){
	tstatVec=tabOverAll["|tstat|",]
	for(i in 1:ncol(tabOverAll)){
		tstaT=tstatVec[i]
		shrVec=tabShrinkAll[2:5,i]
		shrTstatM[1,i]=minShrTstat(shrVec)
		shrTstatM[2,i]=avgShrTstat(shrVec)
		shrTstatM[3,i]=minShrTstat(shrVec)*tstaT
		shrTstatM[4,i]=avgShrTstat(shrVec)*tstaT
		}
	}


##%%%%%%%%%%% Collect all this in a single matrix %%%%%%%%%

demingTstatOverallM=rbind(tabOverAll,tabShrinkAll,shrTstatM)

## > round(demingTstatOverallM,2)
##                       5-6  7-10 11-14  5-14 Nontest Score Long Term
## Coef.                0.14  0.13  0.06  0.10          0.26      0.23
## std                  0.08  0.06  0.06  0.06          0.08      0.07
## |tstat|              1.71  2.22  0.89  1.77          3.23      3.17
## r.squared            0.62  0.62  0.62  0.62          0.58      0.66
## shr.CV                 NA    NA    NA    NA         -0.44     -0.06
## shr.LN              -0.01 -0.01 -0.01 -0.01         -0.19      0.14
## shr.DS              -0.43 -0.43 -0.43 -0.43         -0.68     -0.22
## shr.R               -0.01 -0.01 -0.01 -0.01         -0.18      0.14
## min.shr.tstat        0.00  0.00  0.00  0.00          0.00      0.38
## avg.shr.tstat        0.00  0.00  0.00  0.00          0.00      0.04
## tstat*min.shr.tstat  0.00  0.00  0.00  0.00          0.00      1.19
## tstat*avg.shr.tstat  0.00  0.00  0.00  0.00          0.00      0.12

## Get rid of the average tstat shrinkage
demingTstatOverallM=rbind(tabOverAll,tabShrinkAll,shrTstatM[c(1,3),])

## Write this out and note that n=1115, p=502 except in some
## of the partition models where the addtional dummy variables
## add some variables but don't increase R^2 => overfitting will be
## worse.

# write.csv(demingTstatOverallM,"demingTstatOverallM.csv")

# write.csv(tab4tstatM,"tab4tstatM.csv")


